import pandas as pd
import numpy as np
# Peek at the first 10 rows of the trip data to learn the column layout
# before committing to a full (~50M row) load.
pd.read_csv("tdata.csv",nrows = 10)
| Trip ID | Trip Start Timestamp | Trip End Timestamp | Trip Seconds | Trip Miles | Pickup Census Tract | Dropoff Census Tract | Pickup Community Area | Dropoff Community Area | Fare | ... | Additional Charges | Trip Total | Shared Trip Authorized | Trips Pooled | Pickup Centroid Latitude | Pickup Centroid Longitude | Pickup Centroid Location | Dropoff Centroid Latitude | Dropoff Centroid Longitude | Dropoff Centroid Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | c4a636bddc6b0cc53d20b2cc5635ff5f0b4a8141 | 01/01/2020 12:00:00 AM | 01/01/2020 12:00:00 AM | 721 | 3.8 | NaN | 1.703198e+10 | NaN | 76.0 | 15.0 | ... | 7.85 | 27.85 | False | 1 | NaN | NaN | NaN | 41.979071 | -87.903040 | POINT (-87.9030396611 41.9790708201) |
| 1 | 000f3eb1fc6020bfb8a7daf0441589d314aac546 | 01/01/2020 12:00:00 AM | 01/01/2020 01:45:00 AM | 6399 | 89.0 | NaN | 1.703123e+10 | NaN | 23.0 | 95.0 | ... | 3.35 | 98.35 | False | 1 | NaN | NaN | NaN | 41.899062 | -87.721316 | POINT (-87.7213158985 41.8990616211) |
| 2 | 8af67df7ca52185b15dc4fa29f2d972448fda731 | 01/01/2020 12:00:00 AM | 01/01/2020 12:15:00 AM | 1027 | 8.3 | NaN | NaN | 15.0 | 77.0 | 12.5 | ... | 2.55 | 15.05 | False | 1 | 41.954028 | -87.763399 | POINT (-87.7633990316 41.9540276487) | 41.986712 | -87.663416 | POINT (-87.6634164054 41.9867117999) |
| 3 | 8be05cd4d7bf1997d306fc21dd4c37b9f9413558 | 01/01/2020 12:00:00 AM | 01/01/2020 12:00:00 AM | 384 | 2.0 | NaN | NaN | 28.0 | 28.0 | 5.0 | ... | 2.55 | 10.55 | False | 1 | 41.874005 | -87.663518 | POINT (-87.6635175498 41.874005383) | 41.874005 | -87.663518 | POINT (-87.6635175498 41.874005383) |
| 4 | 8e7e77c3dd22422740402143aa88912521922ee2 | 01/01/2020 12:00:00 AM | 01/01/2020 12:15:00 AM | 553 | 2.4 | NaN | NaN | 9.0 | 10.0 | 7.5 | ... | 2.55 | 13.05 | False | 1 | 42.007613 | -87.813781 | POINT (-87.8137810343 42.0076125931) | 41.985015 | -87.804532 | POINT (-87.8045320063 41.9850151008) |
| 5 | 003454de9b0cb5a3d5a1723d40438f07b574749d | 01/01/2020 12:00:00 AM | 01/01/2020 12:15:00 AM | 884 | 4.4 | 1.703101e+10 | 1.703104e+10 | 1.0 | 4.0 | 10.0 | ... | 2.55 | 13.55 | False | 1 | 42.015934 | -87.666536 | POINT (-87.6665362779 42.0159343756) | 41.972563 | -87.678846 | POINT (-87.6788459662 41.9725625375) |
| 6 | 9343d1e5f13822780e7f8df61463401b90a576ab | 01/01/2020 12:00:00 AM | 01/01/2020 12:30:00 AM | 1134 | 4.7 | NaN | NaN | 30.0 | 30.0 | 10.0 | ... | 2.55 | 12.55 | False | 1 | 41.839087 | -87.714004 | POINT (-87.714003807 41.8390869059) | 41.839087 | -87.714004 | POINT (-87.714003807 41.8390869059) |
| 7 | 94314539302831a4a82eb35789f22644b006201b | 01/01/2020 12:00:00 AM | 01/01/2020 12:45:00 AM | 2559 | 18.9 | NaN | NaN | 20.0 | 43.0 | 17.5 | ... | 2.55 | 20.05 | True | 2 | 41.924347 | -87.734740 | POINT (-87.7347397536 41.9243470769) | 41.761578 | -87.572782 | POINT (-87.5727819867 41.7615779081) |
| 8 | 005827e8adc1c35df898021c18fb9fd3e4f1f5cb | 01/01/2020 12:00:00 AM | 01/01/2020 12:15:00 AM | 977 | 12.4 | NaN | NaN | 32.0 | NaN | 17.5 | ... | 2.55 | 20.05 | False | 1 | 41.878866 | -87.625192 | POINT (-87.6251921424 41.8788655841) | NaN | NaN | NaN |
| 9 | 005d727d45694054ff3502870af8826c255941a5 | 01/01/2020 12:00:00 AM | 01/01/2020 12:15:00 AM | 1205 | 11.6 | NaN | NaN | 69.0 | 27.0 | 17.5 | ... | 2.55 | 20.05 | False | 1 | 41.763247 | -87.616134 | POINT (-87.6161341112 41.7632467988) | 41.878914 | -87.705897 | POINT (-87.7058971305 41.8789144956) |
10 rows × 21 columns
# Inspect dtypes and null counts of the 10-row sample before loading the full file.
pd.read_csv("tdata.csv",nrows = 10).info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Trip ID 10 non-null object 1 Trip Start Timestamp 10 non-null object 2 Trip End Timestamp 10 non-null object 3 Trip Seconds 10 non-null int64 4 Trip Miles 10 non-null float64 5 Pickup Census Tract 1 non-null float64 6 Dropoff Census Tract 3 non-null float64 7 Pickup Community Area 8 non-null float64 8 Dropoff Community Area 9 non-null float64 9 Fare 10 non-null float64 10 Tip 10 non-null int64 11 Additional Charges 10 non-null float64 12 Trip Total 10 non-null float64 13 Shared Trip Authorized 10 non-null bool 14 Trips Pooled 10 non-null int64 15 Pickup Centroid Latitude 8 non-null float64 16 Pickup Centroid Longitude 8 non-null float64 17 Pickup Centroid Location 8 non-null object 18 Dropoff Centroid Latitude 9 non-null float64 19 Dropoff Centroid Longitude 9 non-null float64 20 Dropoff Centroid Location 9 non-null object dtypes: bool(1), float64(12), int64(3), object(5) memory usage: 1.7+ KB
# Load only the three columns needed for the analysis; skipping the other 18
# keeps the ~50M-row file's memory footprint manageable (~1.1 GB).
year_data = pd.read_csv(
    "tdata.csv",
    usecols=["Pickup Census Tract", "Trip Miles", "Trip Seconds"],
)
year_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 49871385 entries, 0 to 49871384 Data columns (total 3 columns): # Column Dtype --- ------ ----- 0 Trip Seconds float64 1 Trip Miles float64 2 Pickup Census Tract float64 dtypes: float64(3) memory usage: 1.1 GB
# Confirm the column subset loaded as expected.
year_data.head(10)
| Trip Seconds | Trip Miles | Pickup Census Tract | |
|---|---|---|---|
| 0 | 721.0 | 3.8 | NaN |
| 1 | 6399.0 | 89.0 | NaN |
| 2 | 1027.0 | 8.3 | NaN |
| 3 | 384.0 | 2.0 | NaN |
| 4 | 553.0 | 2.4 | NaN |
| 5 | 884.0 | 4.4 | 1.703101e+10 |
| 6 | 1134.0 | 4.7 | NaN |
| 7 | 2559.0 | 18.9 | NaN |
| 8 | 977.0 | 12.4 | NaN |
| 9 | 1205.0 | 11.6 | NaN |
# Give the columns analysis-friendly names (GEOID matches the census
# shapefile's join key) and keep only fully-populated records.
year_data = (
    year_data
    .rename(columns={
        "Trip Seconds": "Trip_Seconds",
        "Trip Miles": "Trip_Miles",
        "Pickup Census Tract": "GEOID",
    })
    .dropna()
)
year_data
| Trip_Seconds | Trip_Miles | GEOID | |
|---|---|---|---|
| 5 | 884.0 | 4.4 | 1.703101e+10 |
| 10 | 759.0 | 4.0 | 1.703183e+10 |
| 11 | 248.0 | 1.3 | 1.703106e+10 |
| 12 | 276.0 | 1.1 | 1.703183e+10 |
| 14 | 513.0 | 2.8 | 1.703103e+10 |
| ... | ... | ... | ... |
| 49871368 | 1258.0 | 6.5 | 1.703103e+10 |
| 49871369 | 919.0 | 4.6 | 1.703183e+10 |
| 49871370 | 454.0 | 1.6 | 1.703107e+10 |
| 49871382 | 575.0 | 2.2 | 1.703108e+10 |
| 49871384 | 788.0 | 3.7 | 1.703107e+10 |
25487623 rows × 3 columns
# With NaNs dropped, the float tract IDs can be cast to exact integer GEOIDs
# (values around 1.7e10 are exactly representable in float64, so no precision
# is lost in the conversion).
year_data = year_data.astype({"GEOID":int})
year_data.head()
| Trip_Seconds | Trip_Miles | GEOID | |
|---|---|---|---|
| 5 | 884.0 | 4.4 | 17031010300 |
| 10 | 759.0 | 4.0 | 17031832600 |
| 11 | 248.0 | 1.3 | 17031063302 |
| 12 | 276.0 | 1.1 | 17031832200 |
| 14 | 513.0 | 2.8 | 17031030701 |
# Put the join key (GEOID) first for readability.
year_data = year_data[["GEOID","Trip_Miles","Trip_Seconds"]]
year_data.head()
| GEOID | Trip_Miles | Trip_Seconds | |
|---|---|---|---|
| 5 | 17031010300 | 4.4 | 884.0 |
| 10 | 17031832600 | 4.0 | 759.0 |
| 11 | 17031063302 | 1.3 | 248.0 |
| 12 | 17031832200 | 1.1 | 276.0 |
| 14 | 17031030701 | 2.8 | 513.0 |
# Persist the cleaned subset. index=False keeps the meaningless RangeIndex out
# of the CSV — otherwise it comes back as an "Unnamed: 0" column on re-read
# (exactly the artefact visible later in built-environment-demographics.csv).
year_data.to_csv("geoid-miles-seconds.csv", index=False)
year_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 25487623 entries, 5 to 49871384 Data columns (total 3 columns): # Column Dtype --- ------ ----- 0 GEOID int64 1 Trip_Miles float64 2 Trip_Seconds float64 dtypes: float64(2), int64(1) memory usage: 777.8 MB
# Average trip distance and duration per pickup census tract.
trip_info = year_data.groupby("GEOID").mean().reset_index()
trip_info
| GEOID | Trip_Miles | Trip_Seconds | |
|---|---|---|---|
| 0 | 17031010100 | 7.179774 | 1180.483869 |
| 1 | 17031010201 | 6.450781 | 1080.175410 |
| 2 | 17031010202 | 4.742800 | 882.345516 |
| 3 | 17031010300 | 6.869025 | 1144.760704 |
| 4 | 17031010400 | 7.030327 | 1185.853686 |
| ... | ... | ... | ... |
| 1135 | 17031843700 | 5.527061 | 1057.733495 |
| 1136 | 17031843800 | 7.151539 | 1058.850033 |
| 1137 | 17031843900 | 7.479913 | 1096.066109 |
| 1138 | 17031980000 | 19.692362 | 1904.696142 |
| 1139 | 17031980100 | 16.828538 | 1884.554266 |
1140 rows × 3 columns
# Persist the per-tract averages; index=False keeps the row index out of the
# CSV so it does not reappear as an "Unnamed: 0" column when re-read.
trip_info.to_csv("trip-info.csv", index=False)
# Number of pickups observed in each census tract.
pickups = year_data["GEOID"].value_counts().reset_index()
pickups.columns = ["GEOID","Pickups"]
pickups
| GEOID | Pickups | |
|---|---|---|
| 0 | 17031839100 | 1163394 |
| 1 | 17031980000 | 1046728 |
| 2 | 17031320100 | 746423 |
| 3 | 17031833000 | 739580 |
| 4 | 17031081700 | 712078 |
| ... | ... | ... |
| 1135 | 17031826902 | 1 |
| 1136 | 17031822000 | 1 |
| 1137 | 17031801608 | 1 |
| 1138 | 17031823605 | 1 |
| 1139 | 17031823903 | 1 |
1140 rows × 2 columns
import folium
import geopandas as gpd
# Census-tract polygons; GEOID is the join key shared with the trip data.
geodata = gpd.read_file("shapes.geojson")
geodata = geodata[geodata["GEOID"].notna()]
# Cast to int so values compare equal to the trip data's integer GEOIDs.
geodata = geodata.astype({"GEOID":"int"})
# Keep only tracts that actually have pickup counts.
geodata = geodata[geodata["GEOID"].isin(pickups["GEOID"])]
# folium expects WGS84 (EPSG:4326) lat/lon coordinates.
geodata = geodata.to_crs(epsg = 4326)
geodata.head()
| STATEFP | COUNTYFP | TRACTCE | AFFGEOID | GEOID | NAME | LSAD | ALAND | AWATER | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17 | 031 | 843800 | 1400000US17031843800 | 17031843800 | 8438 | CT | 1309516 | 0 | POLYGON ((-87.64554 41.80886, -87.64068 41.808... |
| 2 | 17 | 031 | 243000 | 1400000US17031243000 | 17031243000 | 2430 | CT | 324548 | 0 | POLYGON ((-87.68195 41.89583, -87.67950 41.895... |
| 3 | 17 | 031 | 250600 | 1400000US17031250600 | 17031250600 | 2506 | CT | 647765 | 0 | POLYGON ((-87.77560 41.90925, -87.77536 41.909... |
| 4 | 17 | 031 | 251700 | 1400000US17031251700 | 17031251700 | 2517 | CT | 486655 | 0 | POLYGON ((-87.74826 41.89498, -87.74645 41.895... |
| 5 | 17 | 031 | 260400 | 1400000US17031260400 | 17031260400 | 2604 | CT | 328225 | 0 | POLYGON ((-87.74061 41.88781, -87.73571 41.887... |
# Base map centred on Chicago.
pickup_map = folium.Map(location = [41.8,-87.6])
pickup_map
# Choropleth of pickups per census tract. `bins` is the documented
# folium.Choropleth parameter for custom class breaks; `threshold_scale`
# is a legacy name from the old Map.choropleth API.
folium.Choropleth(
    geo_data=geodata,
    name="choropleth",
    data=pickups,
    columns=["GEOID","Pickups"],
    bins=[0, 50000, 100000, 150000, 200000, 300000, 500000, 750000, 1000000, 1200000],
    key_on="feature.properties.GEOID",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=.1,
    legend_name="Pickups by Census Tract",
).add_to(pickup_map)
folium.LayerControl().add_to(pickup_map)
pickup_map
built_environment = pd.read_csv("built-environment-demographics.csv")
# Attach pickup counts to the per-tract trip averages, then join the result
# with the built-environment/demographic indicators; both merges key on the
# shared GEOID column (pandas merges on common column names by default).
trip_info = pd.merge(trip_info, pickups)
main_df = pd.merge(trip_info, built_environment)
main_df
| GEOID | Trip_Miles | Trip_Seconds | Pickups | Unnamed: 0 | Unnamed: 0.1 | MedianIncome | Pickup | TotalPopulation | Population_Density | Employment_Density | Percent_Zero_Car_Ownership | LandUse_Diversity | Distance_from_transit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17031010201 | 6.450781 | 1080.175410 | 28499 | 0 | 0 | 41125 | 1837.0 | 7039 | 59.119022 | 1.490412 | 0.019724 | 0.069242 | 234.2500 |
| 1 | 17031010202 | 4.742800 | 882.345516 | 23203 | 1 | 1 | 45236 | 1375.0 | 2852 | 36.278590 | 8.568341 | 0.027317 | 0.724295 | 193.7950 |
| 2 | 17031010300 | 6.869025 | 1144.760704 | 28471 | 2 | 2 | 72917 | 1627.0 | 6650 | 58.470922 | 9.582295 | 0.106673 | 0.346823 | 184.4025 |
| 3 | 17031010400 | 7.030327 | 1185.853686 | 32895 | 3 | 3 | 58438 | 1204.0 | 5153 | 64.711477 | 29.071885 | 0.023679 | 2.464812 | 152.4400 |
| 4 | 17031010501 | 7.247471 | 1185.797851 | 19174 | 4 | 4 | 52747 | 1310.0 | 4147 | 84.888695 | 3.081571 | 0.032108 | 0.066411 | 113.1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 890 | 17031843500 | 8.583506 | 1297.040576 | 4929 | 890 | 918 | 28750 | 150.0 | 10317 | 23.140485 | 6.286981 | 0.085057 | 9.501695 | 537.7900 |
| 891 | 17031843600 | 5.911914 | 956.318086 | 22670 | 891 | 919 | 67609 | 915.0 | 2918 | 28.806947 | 3.053905 | 0.114072 | 0.230467 | 264.8700 |
| 892 | 17031843700 | 5.527061 | 1057.733495 | 46532 | 892 | 920 | 194375 | 1843.0 | 2617 | 14.264697 | 16.557479 | 0.100851 | 4.656542 | 283.6450 |
| 893 | 17031843800 | 7.151539 | 1058.850033 | 1527 | 893 | 921 | 41250 | 58.0 | 1482 | 4.633875 | 0.716691 | 0.092342 | 0.372437 | 274.9300 |
| 894 | 17031843900 | 7.479913 | 1096.066109 | 20451 | 894 | 922 | 50104 | 1619.0 | 3521 | 35.433393 | 2.294889 | 0.098458 | 0.380601 | 199.0220 |
895 rows × 14 columns
# Drop index columns that leaked into the CSV from earlier exports done
# without index=False.
main_df = main_df.drop(columns=["Unnamed: 0","Unnamed: 0.1"])
main_df
| GEOID | Trip_Miles | Trip_Seconds | Pickups | MedianIncome | Pickup | TotalPopulation | Population_Density | Employment_Density | Percent_Zero_Car_Ownership | LandUse_Diversity | Distance_from_transit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17031010201 | 6.450781 | 1080.175410 | 28499 | 41125 | 1837.0 | 7039 | 59.119022 | 1.490412 | 0.019724 | 0.069242 | 234.2500 |
| 1 | 17031010202 | 4.742800 | 882.345516 | 23203 | 45236 | 1375.0 | 2852 | 36.278590 | 8.568341 | 0.027317 | 0.724295 | 193.7950 |
| 2 | 17031010300 | 6.869025 | 1144.760704 | 28471 | 72917 | 1627.0 | 6650 | 58.470922 | 9.582295 | 0.106673 | 0.346823 | 184.4025 |
| 3 | 17031010400 | 7.030327 | 1185.853686 | 32895 | 58438 | 1204.0 | 5153 | 64.711477 | 29.071885 | 0.023679 | 2.464812 | 152.4400 |
| 4 | 17031010501 | 7.247471 | 1185.797851 | 19174 | 52747 | 1310.0 | 4147 | 84.888695 | 3.081571 | 0.032108 | 0.066411 | 113.1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 890 | 17031843500 | 8.583506 | 1297.040576 | 4929 | 28750 | 150.0 | 10317 | 23.140485 | 6.286981 | 0.085057 | 9.501695 | 537.7900 |
| 891 | 17031843600 | 5.911914 | 956.318086 | 22670 | 67609 | 915.0 | 2918 | 28.806947 | 3.053905 | 0.114072 | 0.230467 | 264.8700 |
| 892 | 17031843700 | 5.527061 | 1057.733495 | 46532 | 194375 | 1843.0 | 2617 | 14.264697 | 16.557479 | 0.100851 | 4.656542 | 283.6450 |
| 893 | 17031843800 | 7.151539 | 1058.850033 | 1527 | 41250 | 58.0 | 1482 | 4.633875 | 0.716691 | 0.092342 | 0.372437 | 274.9300 |
| 894 | 17031843900 | 7.479913 | 1096.066109 | 20451 | 50104 | 1619.0 | 3521 | 35.433393 | 2.294889 | 0.098458 | 0.380601 | 199.0220 |
895 rows × 12 columns
# List the column names (note ' Employment_Density' carries a leading space
# inherited from the CSV header).
main_df.columns
Index(['GEOID', 'Trip_Miles', 'Trip_Seconds', 'Pickups', 'MedianIncome',
'Pickup', 'TotalPopulation', 'Population_Density',
' Employment_Density', 'Percent_Zero_Car_Ownership',
'LandUse_Diversity', 'Distance_from_transit'],
dtype='object')
# Reorder columns: key, response (Pickups), then predictors.
# NOTE(review): ' Employment_Density' deliberately keeps its leading space to
# match the CSV header exactly — consider stripping whitespace from column
# names at load time instead.
main_df = main_df[['GEOID', 'Pickups','Trip_Miles', 'Trip_Seconds', 'MedianIncome',
'TotalPopulation', 'Population_Density',
' Employment_Density', 'Percent_Zero_Car_Ownership',
'LandUse_Diversity', 'Distance_from_transit']]
import seaborn as sns
# Feature matrix for clustering: every column except the GEOID key.
X = main_df[main_df.columns[1:12]]
X
| Pickups | Trip_Miles | Trip_Seconds | MedianIncome | TotalPopulation | Population_Density | Employment_Density | Percent_Zero_Car_Ownership | LandUse_Diversity | Distance_from_transit | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 28499 | 6.450781 | 1080.175410 | 41125 | 7039 | 59.119022 | 1.490412 | 0.019724 | 0.069242 | 234.2500 |
| 1 | 23203 | 4.742800 | 882.345516 | 45236 | 2852 | 36.278590 | 8.568341 | 0.027317 | 0.724295 | 193.7950 |
| 2 | 28471 | 6.869025 | 1144.760704 | 72917 | 6650 | 58.470922 | 9.582295 | 0.106673 | 0.346823 | 184.4025 |
| 3 | 32895 | 7.030327 | 1185.853686 | 58438 | 5153 | 64.711477 | 29.071885 | 0.023679 | 2.464812 | 152.4400 |
| 4 | 19174 | 7.247471 | 1185.797851 | 52747 | 4147 | 84.888695 | 3.081571 | 0.032108 | 0.066411 | 113.1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 890 | 4929 | 8.583506 | 1297.040576 | 28750 | 10317 | 23.140485 | 6.286981 | 0.085057 | 9.501695 | 537.7900 |
| 891 | 22670 | 5.911914 | 956.318086 | 67609 | 2918 | 28.806947 | 3.053905 | 0.114072 | 0.230467 | 264.8700 |
| 892 | 46532 | 5.527061 | 1057.733495 | 194375 | 2617 | 14.264697 | 16.557479 | 0.100851 | 4.656542 | 283.6450 |
| 893 | 1527 | 7.151539 | 1058.850033 | 41250 | 1482 | 4.633875 | 0.716691 | 0.092342 | 0.372437 | 274.9300 |
| 894 | 20451 | 7.479913 | 1096.066109 | 50104 | 3521 | 35.433393 | 2.294889 | 0.098458 | 0.380601 | 199.0220 |
895 rows × 10 columns
# Keep only complete rows that contain no zero entries in any column, then
# report the remaining sample size.
X = X.dropna()
nonzero_rows = (X != 0).all(axis=1)
X = X[nonzero_rows]
X.shape
(889, 10)
# Pairwise scatterplots across all features.
sns.pairplot(X)
<seaborn.axisgrid.PairGrid at 0x7fc848d21f40>
# Feature-correlation heatmap.
sns.heatmap(X.corr())
<AxesSubplot:>
# Summary statistics for each feature.
X.describe()
| Pickups | Trip_Miles | Trip_Seconds | MedianIncome | TotalPopulation | Population_Density | Employment_Density | Percent_Zero_Car_Ownership | LandUse_Diversity | Distance_from_transit | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 8.890000e+02 | 889.000000 | 889.000000 | 889.000000 | 889.000000 | 889.000000 | 889.000000 | 889.000000 | 889.000000 | 889.000000 |
| mean | 2.698930e+04 | 6.896870 | 1098.841019 | 84220.039370 | 3679.753656 | 31.174132 | 11.469629 | 0.208842 | 1.535303 | 279.243646 |
| std | 8.115247e+04 | 2.642166 | 230.613680 | 54337.144703 | 1832.146235 | 27.338535 | 49.842658 | 0.156357 | 7.612212 | 112.575014 |
| min | 2.000000e+00 | 3.322565 | 688.884793 | 11457.000000 | 416.000000 | 2.139127 | 0.027023 | 0.002972 | 0.004566 | 23.470000 |
| 25% | 1.532000e+03 | 5.378845 | 954.960748 | 44911.000000 | 2279.000000 | 15.517109 | 1.572308 | 0.076892 | 0.184376 | 211.900000 |
| 50% | 5.553000e+03 | 6.527606 | 1065.800529 | 67000.000000 | 3485.000000 | 25.432728 | 3.351064 | 0.172012 | 0.401153 | 257.493333 |
| 75% | 1.846000e+04 | 7.515382 | 1194.269912 | 107267.000000 | 4861.000000 | 38.537783 | 7.697686 | 0.304828 | 0.950401 | 327.900000 |
| max | 1.163394e+06 | 37.400000 | 2655.400000 | 250000.000000 | 19889.000000 | 407.584189 | 1102.545869 | 0.740122 | 176.285714 | 1062.170000 |
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
# NOTE(review): X is unscaled here, so large-magnitude features (Pickups up to
# ~1.2M, MedianIncome up to 250k) dominate the Euclidean distances — confirm
# this is intended; the standardised matrix sc_X is only built further below.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Final model: k = 3, chosen from the elbow plot above. Same caveat as the
# elbow search — fitted on unscaled features, so high-magnitude columns
# dominate the clustering; TODO confirm this is intentional.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)
clusters = kmeans.predict(X)
# Centroids expressed in the original (unscaled) feature units.
cluster_centers = pd.DataFrame(kmeans.cluster_centers_)
cluster_centers.columns = X.columns.values
cluster_centers
| Pickups | Trip_Miles | Trip_Seconds | MedianIncome | TotalPopulation | Population_Density | Employment_Density | Percent_Zero_Car_Ownership | LandUse_Diversity | Distance_from_transit | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8348.449929 | 7.248445 | 1131.588034 | 61194.454161 | 3672.187588 | 27.764839 | 4.392629 | 0.207883 | 0.936670 | 285.514218 |
| 1 | 644577.600000 | 4.490997 | 805.811809 | 185783.900000 | 8360.400000 | 72.685729 | 297.063445 | 0.339215 | 14.438450 | 173.048917 |
| 2 | 68403.870588 | 5.572115 | 979.503715 | 174275.929412 | 3435.976471 | 42.951031 | 24.185244 | 0.205175 | 3.272948 | 259.338417 |
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram
from pandas.plotting import parallel_coordinates
# Ten distinct colours shared by all cluster plots below.
palette = sns.color_palette("bright", 10)
def addAlpha(colour, alpha):
    '''Return the RGB *colour* as an RGBA tuple with the given *alpha*.'''
    r, g, b = colour[0], colour[1], colour[2]
    return (r, g, b, alpha)
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    '''Display a scatter plot on a factorial plane, one for each factorial plane.

    X_projected: 2-D array of points already projected into PCA space
        (rows = samples, columns = principal components).
    n_comp: number of components available; planes whose second axis index
        is >= n_comp are skipped.
    pca: fitted PCA object — only explained_variance_ratio_ is read, for
        the axis labels.
    axis_ranks: iterable of (d1, d2) component-index pairs; one figure per pair.
    labels: optional per-point text labels drawn on the points.
    alpha: point transparency.
    illustrative_var: optional per-point category; points are coloured and
        legended by its unique values.
    '''
    # For each factorial plane
    for d1,d2 in axis_ranks:
        if d2 < n_comp:
            # Initialise the matplotlib figure
            fig = plt.figure(figsize=(7,6))
            # Display the points
            if illustrative_var is None:
                plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
            else:
                # One scatter call per category so each gets its own colour/legend entry
                illustrative_var = np.array(illustrative_var)
                for value in np.unique(illustrative_var):
                    selected = np.where(illustrative_var == value)
                    plt.scatter(X_projected[selected, d1], X_projected[selected, d2], alpha=alpha, label=value)
                plt.legend()
            # Display the labels on the points
            if labels is not None:
                for i,(x,y) in enumerate(X_projected[:,[d1,d2]]):
                    plt.text(x, y, labels[i],
                              fontsize='14', ha='center',va='center')
            # Define the limits of the chart (symmetric, 10% beyond the extreme point)
            boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
            plt.xlim([-boundary,boundary])
            plt.ylim([-boundary,boundary])
            # Display grid lines through the origin
            plt.plot([-100, 100], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-100, 100], color='grey', ls='--')
            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))
            plt.title("Projection of points (on PC{} and PC{})".format(d1+1, d2+1))
            #plt.show(block=False)
def display_parallel_coordinates(df, num_clusters):
    '''Display a parallel coordinates plot for the clusters in df.

    df: DataFrame with a 'cluster' column holding integer labels 0..num_clusters-1.
    num_clusters: number of clusters (= number of stacked subplots drawn).
    One subplot per cluster; that cluster's lines are drawn opaque on top of
    the other clusters' faded lines for context.
    '''
    # Select data points for individual clusters
    cluster_points = []
    for i in range(num_clusters):
        cluster_points.append(df[df.cluster==i])
    # Create the plot
    fig = plt.figure(figsize=(12, 15))
    title = fig.suptitle("Parallel Coordinates Plot for the Clusters", fontsize=18)
    fig.subplots_adjust(top=0.95, wspace=0)
    # Display one plot for each cluster, with the lines for the main cluster appearing over the lines for the other clusters
    for i in range(num_clusters):
        plt.subplot(num_clusters, 1, i+1)
        # Draw the other clusters first, faded (alpha 0.2) ...
        for j,c in enumerate(cluster_points):
            if i!= j:
                pc = parallel_coordinates(c, 'cluster', color=[addAlpha(palette[j],0.2)])
        # ... then the subplot's own cluster on top at alpha 0.5
        pc = parallel_coordinates(cluster_points[i], 'cluster', color=[addAlpha(palette[i],0.5)])
        # Stagger the axes so long tick labels don't overlap
        ax=plt.gca()
        for tick in ax.xaxis.get_major_ticks()[1::2]:
            tick.set_pad(20)
def display_parallel_coordinates_centroids(df, num_clusters):
    '''Display a parallel coordinates plot for the centroids in df.

    df: DataFrame of centroids with a 'cluster' column used for colouring.
    num_clusters: currently unused — kept for signature symmetry with
        display_parallel_coordinates; the plot draws every row of df.
    '''
    # Create the plot
    fig = plt.figure(figsize=(12, 5))
    title = fig.suptitle("Parallel Coordinates plot for the Centroids", fontsize=18)
    fig.subplots_adjust(top=0.9, wspace=0)
    # Draw the chart
    parallel_coordinates(df, 'cluster', color=palette)
    # Stagger the axes so long tick labels don't overlap
    ax=plt.gca()
    for tick in ax.xaxis.get_major_ticks()[1::2]:
        tick.set_pad(20)
from sklearn.preprocessing import StandardScaler
# Standardise features to zero mean / unit variance. The fitted scaler `sc`
# is reused later to project the k-means centroids into this same space.
sc = StandardScaler()
sc_X = sc.fit_transform(X)
# fit_transform returns a bare ndarray; wrap it back into a DataFrame
# (the original column names are lost — columns become 0..9).
sc_X = pd.DataFrame(sc_X)
sc_X
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.018614 | -0.168930 | -0.080984 | -0.793551 | 1.834535 | 1.022755 | -0.200327 | -1.210209 | -0.192702 | -0.399902 |
| 1 | -0.046683 | -0.815725 | -0.939308 | -0.717851 | -0.452049 | 0.186818 | -0.058242 | -1.161620 | -0.106600 | -0.759465 |
| 2 | 0.018269 | -0.010545 | 0.199232 | -0.208134 | 1.622097 | 0.999035 | -0.037887 | -0.653805 | -0.156216 | -0.842945 |
| 3 | 0.072814 | 0.050539 | 0.377522 | -0.474750 | 0.804562 | 1.227433 | 0.353355 | -1.184900 | 0.122176 | -1.127026 |
| 4 | -0.096358 | 0.132769 | 0.377279 | -0.579544 | 0.255170 | 1.965899 | -0.168385 | -1.130961 | -0.193074 | -1.476679 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 884 | -0.271991 | 0.638713 | 0.859928 | -1.021424 | 3.624701 | -0.294023 | -0.104039 | -0.792128 | 1.047117 | 2.297951 |
| 885 | -0.053254 | -0.372993 | -0.618364 | -0.305875 | -0.416005 | -0.086637 | -0.168941 | -0.606457 | -0.171510 | -0.127752 |
| 886 | 0.240950 | -0.518734 | -0.178353 | 2.028391 | -0.580386 | -0.618868 | 0.102136 | -0.691061 | 0.410261 | 0.039119 |
| 887 | -0.313935 | 0.096441 | -0.173509 | -0.791249 | -1.200227 | -0.971347 | -0.215859 | -0.745509 | -0.152849 | -0.038340 |
| 888 | -0.080613 | 0.220793 | -0.012039 | -0.628212 | -0.086698 | 0.155885 | -0.184178 | -0.706372 | -0.151776 | -0.713007 |
889 rows × 10 columns
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from scipy.cluster.hierarchy import dendrogram
from pandas.plotting import parallel_coordinates
# Copy the standardised feature matrix and attach each row's k-means label.
X_scaled_clustered = sc_X.copy()
X_scaled_clustered['cluster'] = clusters
X_scaled_clustered.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.018614 | -0.168930 | -0.080984 | -0.793551 | 1.834535 | 1.022755 | -0.200327 | -1.210209 | -0.192702 | -0.399902 | 0 |
| 1 | -0.046683 | -0.815725 | -0.939308 | -0.717851 | -0.452049 | 0.186818 | -0.058242 | -1.161620 | -0.106600 | -0.759465 | 0 |
| 2 | 0.018269 | -0.010545 | 0.199232 | -0.208134 | 1.622097 | 0.999035 | -0.037887 | -0.653805 | -0.156216 | -0.842945 | 0 |
| 3 | 0.072814 | 0.050539 | 0.377522 | -0.474750 | 0.804562 | 1.227433 | 0.353355 | -1.184900 | 0.122176 | -1.127026 | 0 |
| 4 | -0.096358 | 0.132769 | 0.377279 | -0.579544 | 0.255170 | 1.965899 | -0.168385 | -1.130961 | -0.193074 | -1.476679 | 0 |
from sklearn.decomposition import PCA
# Create a PCA model to reduce our data to 2 dimensions for visualisation
pca = PCA(n_components=2)
pca.fit(sc_X)
# Transform the scaled data to the new PCA space
X_reduced = pca.transform(sc_X)
# Convert to a data frame; reuse X's index so rows line up with the
# (filtered) feature matrix.
X_reduceddf = pd.DataFrame(X_reduced, index=X.index, columns=['PC1','PC2'])
X_reduceddf['cluster'] = clusters
X_reduceddf.head()
| PC1 | PC2 | cluster | |
|---|---|---|---|
| 0 | -0.057504 | 0.243946 | 0 |
| 1 | 0.430891 | -1.094114 | 0 |
| 2 | 0.193360 | 0.337670 | 0 |
| 3 | 0.296173 | 0.362192 | 0 |
| 4 | 0.250216 | -0.355774 | 0 |
# Scatter the points on the first factorial plane (PC1 vs PC2), coloured by cluster.
display_factorial_planes(X_reduced, 2, pca, [(0,1)], illustrative_var = clusters, alpha = 0.8)
# Add the cluster number to the original scaled data
X_clustered = pd.DataFrame(sc_X, index=sc_X.index, columns=sc_X.columns)
X_clustered["cluster"] = clusters
# Display parallel coordinates plots, one for each cluster
display_parallel_coordinates(X_clustered, 3)
# Express the k-means centroids in the standardised feature space, so they are
# comparable to the sc_X rows plotted above.
# BUG FIX: the original called sc.fit_transform(kmeans.cluster_centers_),
# which RE-FITS the scaler on the 3 centroids themselves (each column rescaled
# by the centroids' own mean/std). transform() reuses the scaler fitted on X,
# which is the projection actually intended here.
centroids = pd.DataFrame(sc.transform(kmeans.cluster_centers_), columns=X.columns)
centroids['cluster'] = centroids.index
display_parallel_coordinates_centroids(centroids, 10)
# How many tracts landed in each cluster (sorted by count, descending).
cluster_distribution = X_reduceddf["cluster"].value_counts().to_frame().reset_index()
cluster_distribution
| index | cluster | |
|---|---|---|
| 0 | 0 | 709 |
| 1 | 2 | 170 |
| 2 | 1 | 10 |
# Rename to descriptive column names; rows remain in value_counts order (0, 2, 1).
cluster_distribution.columns = ["cluster","number"]
cluster_distribution
| cluster | number | |
|---|---|---|
| 0 | 0 | 709 |
| 1 | 2 | 170 |
| 2 | 1 | 10 |
# Share of tracts per cluster. Derive labels from the data rather than a
# hard-coded list so they cannot drift out of sync with value_counts order.
plt.pie(
    cluster_distribution["number"],
    labels=[f"Cluster {c}" for c in cluster_distribution["cluster"]],
    shadow=True,
)
([<matplotlib.patches.Wedge at 0x7fc8221529a0>, <matplotlib.patches.Wedge at 0x7fc8221631c0>, <matplotlib.patches.Wedge at 0x7fc822163910>], [Text(-0.8848651129616366, 0.653462877188437, 'Cluster 0'), Text(0.861225012024064, -0.6843182583156402, 'Cluster 2'), Text(1.0993132202631353, -0.0388644278832949, 'Cluster 1')])